import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import numbers
import os
import plotly
import matplotlib.pyplot as plt
import scvelo as scv
from tqdm.notebook import tqdm
import itertools
import random
import seaborn as sns
import rpy2
import rpy2.robjects as ro
warnings.filterwarnings('ignore')
%load_ext rpy2.ipython
%%R
library(edgeR)
library(org.Hs.eg.db)
library(AnnotationDbi)
library(stats)
R[write to console]: Loading required package: limma
R[write to console]: Loading required package: AnnotationDbi
R[write to console]: Loading required package: stats4
R[write to console]: Loading required package: BiocGenerics
R[write to console]: Loading required package: parallel
R[write to console]:
Attaching package: ‘BiocGenerics’
R[write to console]: The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
R[write to console]: The following object is masked from ‘package:limma’:
plotMA
R[write to console]: The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
R[write to console]: The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames,
dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
union, unique, unsplit, which.max, which.min
R[write to console]: Loading required package: Biobase
R[write to console]: Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
R[write to console]: Loading required package: IRanges
R[write to console]: Loading required package: S4Vectors
R[write to console]:
Attaching package: ‘S4Vectors’
R[write to console]: The following object is masked from ‘package:base’:
expand.grid
R[write to console]:
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
plotly.offline.init_notebook_mode()
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (9, 9)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
paths = yaml.load(f, Loader=yaml.FullLoader)
#indir=paths["paths"]["indir"][hostRoot]
FinaLeaf="/Neurons"
outdir="./outdir/"
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
with open("./colorMap.yaml", 'r') as f:
colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
{'medial': {'color': '#CD5C5C'},
'distal': {'color': '#FFCBCB'},
'proximal': {'color': '#8D021F'},
'piece1': {'color': '#281E5D'},
'piece2': {'color': '#3779FF'},
'piece3': {'color': '#BFD4FF'},
'control': {'color': '#0056D1'},
'polaroid': {'color': '#DE001E'},
'enriched': {'color': '#DE001E'},
'not_enriched': {'color': '#0056D1'},
'pfc': {'color': '#DE001E'},
'somatosensory': {'color': '#E5E4E2'},
'temporal': {'color': '#0056D1'},
'motor': {'color': '#37F7C8'},
'v1': {'color': '#28F30C'},
'parietal': {'color': '#D41FFC'}}
adata = sc.read_h5ad(outdir+FinaLeaf+"/4A_Neurons_DA.h5ad")
adataraw = sc.read_h5ad(outdir+"/1_polaroid_mint.h5ad")[adata.obs_names]
adataraw.obs = adata.obs
adata = adataraw
sc.pp.normalize_total(adata)
normalizing counts per cell
finished (0:00:00)
adata.obs
| dataset | organoid | region | type | type_region | regionContrast | n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | ... | leiden0.3 | S_score | G2M_score | phase | subLeiden | umap_density_type | umap_density_region | umap_density_organoid | DiffAbundant | leiden_partition_final | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACGAACATTCAGCA-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 3191 | 8.068403 | 7746.0 | 8.955061 | ... | 2 | -0.128590 | -0.127933 | G1 | 1 | 0.773493 | 0.685443 | 0.741122 | 0 | not_enriched |
| AACAAGATCGAACCTA-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 2411 | 7.788212 | 5934.0 | 8.688622 | ... | 2 | -0.174303 | -0.211261 | G1 | 1 | 0.243025 | 0.190615 | 0.295339 | 0 | not_enriched |
| AACCAACAGGACTTCT-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 942 | 6.849066 | 1343.0 | 7.203405 | ... | 2 | -0.158302 | -0.146888 | G1 | 1 | 0.585016 | 0.606536 | 0.665529 | 0 | not_enriched |
| AACCACAAGGATGTTA-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 1700 | 7.438972 | 3382.0 | 8.126518 | ... | 2 | -0.017421 | -0.150694 | G1 | 0 | 0.280893 | 0.337840 | 0.210278 | 0 | not_enriched |
| AACCTGAAGTCAGGGT-1_control3_piece2 | control3_piece2 | control3 | piece2 | control | control_piece2 | control | 2189 | 7.691657 | 4256.0 | 8.356320 | ... | 2 | -0.099218 | -0.105115 | G1 | 0 | 0.657916 | 0.486592 | 0.829882 | 0 | not_enriched |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTGCATTAGGGCAATC-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 4158 | 8.333030 | 12796.0 | 9.456966 | ... | 8 | -0.159563 | -0.157904 | G1 | 4 | 0.853591 | 0.693369 | 0.787649 | 1 | enriched |
| TTGTGTTAGATCGACG-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 2715 | 7.906915 | 5878.0 | 8.679142 | ... | 8 | -0.108142 | -0.146919 | G1 | 2 | 0.887941 | 0.689022 | 0.919148 | 1 | enriched |
| TTGTTCACATCAGTCA-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 1589 | 7.371489 | 4025.0 | 8.300529 | ... | 8 | -0.085110 | 0.002994 | G2M | 4 | 0.453936 | 0.387748 | 0.400370 | 1 | enriched |
| TTTGATCGTATGATCC-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 4502 | 8.412499 | 16372.0 | 9.703389 | ... | 2 | -0.175276 | -0.111532 | G1 | 1 | 0.066308 | 0.021418 | 0.063423 | 0 | not_enriched |
| TTTGGAGTCGTCAAAC-1_polaroid3_proximal | polaroid3_proximal | polaroid3 | proximal | polaroid | polaroid_proximal | proximal | 3663 | 8.206311 | 9542.0 | 9.163563 | ... | 8 | -0.123037 | -0.117675 | G1 | 4 | 0.787412 | 0.870449 | 0.255348 | 1 | enriched |
3650 rows × 29 columns
adata.write_h5ad(outdir+FinaLeaf+"/5A_Neurons_preBulk.h5ad")
groupingCovariate = "region"
PseudooReplicates_per_group = 10
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
adata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 10 PRs will result in following cells per PR
region distal 48.8 medial 27.4 piece1 58.6 piece2 54.5 piece3 64.4 proximal 111.3 dtype: float64
total = pd.DataFrame(index = adata.var_names)
total_metadata = pd.DataFrame(index= ["_".join(Rep) for Rep in list(itertools.product(adata.obs[groupingCovariate].cat.categories.tolist(), [str(r) for r in list(range(PseudooReplicates_per_group))]))])
for group in adata.obs[groupingCovariate].cat.categories:
groupAdata = adata[adata.obs[groupingCovariate] == group]
group_cells = groupAdata.obs_names.tolist()
random.Random(4).shuffle(group_cells)
metaCellslist=[group_cells[i::PseudooReplicates_per_group] for i in range(PseudooReplicates_per_group)]
for partition in enumerate(metaCellslist):
total['{}_{}'.format(group, partition[0])] = adata[partition[1]].X.sum(axis = 0).A1
total_metadata.loc['{}_{}'.format(group, partition[0]),"group"] = group
total_metadata.loc['{}_{}'.format(group, partition[0]),"pseudoreplicate"] = partition[0]
total_metadata.loc['{}_{}'.format(group, partition[0]),"number_of_cell"] = int(len(partition[1]))
#With this line we propagate - whenever possible - obs to aggregated pReplicate
for obsMD in [obsMD for obsMD in groupAdata.obs.columns.tolist() if len(groupAdata.obs[obsMD].unique()) == 1 and obsMD != groupingCovariate]:
total_metadata.loc[["_".join(l) for l in list(itertools.product([group],[str(r) for r in list(range(PseudooReplicates_per_group))]))], obsMD ] = adata.obs.loc[adata.obs[groupingCovariate] == group,obsMD][0]
total_metadata = total_metadata.dropna(axis = 1)
adata_bulk = sc.AnnData(total.transpose()).copy()
adata_bulk.var = adata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)
adata_bulk.obs["group"] =adata_bulk.obs["group"].astype("category")
with open("./colorMap.yaml", 'r') as f:
colorMap = yaml.load(f, Loader=yaml.FullLoader)["uns_colors"]
colorMap
adata_bulk.uns["group_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["group"].cat.categories]
adata_bulk.obs["regionContrast"] = adata_bulk.obs["regionContrast"].astype("category")
adata_bulk.uns["regionContrast_colors"] = [colorMap[i]["color"] for i in adata_bulk.obs["regionContrast"].cat.categories]
adata_bulk.write_h5ad(outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad")
total.to_csv(outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv", sep="\t")
... storing 'type' as categorical ... storing 'type_region' as categorical ... storing 'is.Stressed' as categorical
adata_bulk
AnnData object with n_obs × n_vars = 60 × 33542
obs: 'group', 'pseudoreplicate', 'number_of_cell', 'type', 'type_region', 'regionContrast', 'is.Stressed'
var: 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
uns: 'group_colors', 'regionContrast_colors'
import ipynbname
nb_fname = ipynbname.name()
%%bash -s "$nb_fname"
sleep 120
jupyter nbconvert "$1".ipynb --to="python" --ClearOutputPreprocessor.enabled=True
jupyter nbconvert "$1".ipynb --to="html"
[NbConvertApp] Converting notebook 05A_Neurons_pBulk.ipynb to python [NbConvertApp] Writing 8073 bytes to 05A_Neurons_pBulk.py [NbConvertApp] Converting notebook 05A_Neurons_pBulk.ipynb to html [NbConvertApp] Writing 4300226 bytes to 05A_Neurons_pBulk.html